## This file cleans loads in data from the American Community Survey (ACS), ##
## using the tidycensus package, and prepares it for analysis ##
## Created by Meredith Dost and last run 8/16/2025 ##

# load packages
library(tidycensus)
library(tidyr)

# tell the tidycensus package what your census API key is (and put in quotations)
# census_api_key("", install = TRUE, overwrite=TRUE)

## choosing ACS vars to download ##
vars.10.acs5 <- c("DP05_0001E",# total population
             "DP05_0021E", # population age 65+
             "DP05_0072E", # population white alone, non-Hispanic
             "S1501_C01_002E", # pop 18-24 <HS grad
             "S1501_C01_003E", # pop 18-24 HS grad
             "S1501_C01_007E", # pop 25+ <9th grade
             "S1501_C01_008E", # pop 25+ 9-12 grade
             "S1501_C01_009E", # pop 25+ HS grad
             "S1903_C01_001E") # median HH income
vars.10.acs1 <- c("DP05_0001E",# total population
                  "C27007_004", #Male!!Under 18 years!!With Medicaid/means-tested public coverage
                  "C27007_007", #Male!!18 to 64 years!!With Medicaid/means-tested public coverage
                  "C27007_010", #Male!!65 years and over!!With Medicaid/means-tested public coverage
                  "C27007_014", #Female!!Under 18 years!!With Medicaid/means-tested public coverage
                  "C27007_017", #Female!!18 to 64 years!!With Medicaid/means-tested public coverage
                  "C27007_020") #Female!!65 years and over!!With Medicaid/means-tested public coverage
vars.12 <- c("DP05_0001E",# total population
               "DP05_0021E", # population age 65+
               "DP05_0072E", # population white alone, non-Hispanic
               "S1501_C01_002E", # pop 18-24 <HS grad
               "S1501_C01_003E", # pop 18-24 HS grad
               "S1501_C01_007E", # pop 25+ <9th grade
               "S1501_C01_008E", # pop 25+ 9-12 grade
               "S1501_C01_009E", # pop 25+ HS grad
               "S1903_C01_001E", # median HH income
               "C27007_004", #Male!!Under 18 years!!With Medicaid/means-tested public coverage
               "C27007_007", #Male!!18 to 64 years!!With Medicaid/means-tested public coverage
               "C27007_010", #Male!!65 years and over!!With Medicaid/means-tested public coverage
               "C27007_014", #Female!!Under 18 years!!With Medicaid/means-tested public coverage
               "C27007_017", #Female!!18 to 64 years!!With Medicaid/means-tested public coverage
               "C27007_020") #Female!!65 years and over!!With Medicaid/means-tested public coverage
vars.14 <- c("DP05_0001E",# total population CHECKED ALL 2014
             "DP05_0021E", # population age 65+
             "DP05_0072E", # population white alone, non-Hispanic
             "S1501_C01_002E", # pop 18-24 <HS grad
             "S1501_C01_003E", # pop 18-24 HS grad
             "S1501_C01_007E", # pop 25+ <9th grade
             "S1501_C01_008E", # pop 25+ 9-12 grade
             "S1501_C01_009E", # pop 25+ HS grad
             "S1903_C01_001E", # median HH income
             "C27007_004", #Male!!Under 18 years!!With Medicaid/means-tested public coverage
             "C27007_007", #Male!!18 to 64 years!!With Medicaid/means-tested public coverage
             "C27007_010", #Male!!65 years and over!!With Medicaid/means-tested public coverage
             "C27007_014", #Female!!Under 18 years!!With Medicaid/means-tested public coverage
             "C27007_017", #Female!!18 to 64 years!!With Medicaid/means-tested public coverage
             "C27007_020") #Female!!65 years and over!!With Medicaid/means-tested public coverage
vars.16 <- c("DP05_0001E",# total population CHECKED ALL 2016
             "DP05_0021E", # population age 65+
             "DP05_0072E", # population white alone, non-Hispanic
             "S1501_C01_002E", # pop 18-24 <HS grad
             "S1501_C01_003E", # pop 18-24 HS grad
             "S1501_C01_007E", # pop 25+ <9th grade
             "S1501_C01_008E", # pop 25+ 9-12 grade
             "S1501_C01_009E", # pop 25+ HS grad
             "S1903_C01_001E", # median HH income
             "S2704_C02_023", #Public Coverage!!Estimate!!MEDICAID/MEANS-TESTED PUBLIC COVERAGE ALONE OR IN COMBINATION!!Under 18
             "S2704_C02_024", #Public Coverage!!Estimate!!MEDICAID/MEANS-TESTED PUBLIC COVERAGE ALONE OR IN COMBINATION!!18 to 64 years
             "S2704_C02_025") #Public Coverage!!Estimate!!MEDICAID/MEANS-TESTED PUBLIC COVERAGE ALONE OR IN COMBINATION!!65 years and over
vars.1820 <- c("DP05_0001E",# total population
          "DP05_0029E", # population age 65+
          "DP05_0077E", # population white alone, non-Hispanic
          "DP05_0087E", # citizen voting age pop 18+
          "S1501_C01_002E", # pop 18-24 <HS grad
          "S1501_C01_003E", # pop 18-24 HS grad
          "S1501_C01_007E", # pop 25+ <9th grade
          "S1501_C01_008E", # pop 25+ 9-12 grade
          "S1501_C01_009E", # pop 25+ HS grad
          "S1903_C03_001E", # median HH income 
          "S2704_C01_007E", # Estimate!!Total!!Medicaid/means-tested public coverage alone or in combination!!Under 19
          "S2704_C01_008E", # Estimate!!Total!!Medicaid/means-tested public coverage alone or in combination!!19 to 64 years
          "S2704_C01_009E" )# Estimate!!Total!!Medicaid/means-tested public coverage alone or in combination!!65 years and over
## get data ##
acs10.1yr <- get_acs(geography = "county", 
                     variables = vars.10.acs1, 
                     year = 2010, survey = "acs1")
acs10.5yr <- get_acs(geography = "county", 
                 variables = vars.10.acs5, 
                 year = 2010, survey = "acs5")
acs12 <- get_acs(geography = "county", 
                 variables = vars.12, 
                 year = 2012, survey = "acs5")
acs14 <- get_acs(geography = "county", 
                 variables = vars.14, 
                 year = 2014, survey = "acs5")
acs16 <- get_acs(geography = "county", 
                 variables = vars.16, 
                 year = 2016, survey = "acs5")
acs18 <- get_acs(geography = "county", 
                 variables = vars.1820, 
                 year = 2018, survey = "acs5")
acs20 <- get_acs(geography = "county", 
               variables = vars.1820, 
               year = 2020, survey = "acs5")

## clean data ##
### 2010 start 1-YEAR
acs10.sub <- acs10.1yr
acs10.sub$NAME <- NULL
acs10.sub$moe <- NULL
data_wide <- spread(acs10.sub, GEOID, estimate)
df2 <- data.frame(t(data_wide[-1]))
colnames(df2) <- c("med_m_u18","med_m_1864","med_m_65p","med_f_u18","med_f_1864","med_f_65p","pop_total")
df2$GEOID <- rownames(df2)
df2$med_tot <- df2$med_m_u18+df2$med_m_1864+df2$med_m_65+df2$med_f_u18+df2$med_f_1864+df2$med_f_65p
df2$pct_med <- df2$med_tot/df2$pop_total
df2 <- df2[c("GEOID","pct_med")]
df2$highmed <- ifelse(df2$pct_med>=median(df2$pct_med, na.rm=T), 1, 0)
d10.1yr <- df2
rm(acs10.1yr,acs10.sub,data_wide,df2,vars.10.acs1)
### 2010 end

### 2010 start 5 YEAR DATA
acs10.sub <- acs10.5yr
acs10.sub$NAME <- NULL
acs10.sub$moe <- NULL
data_wide <- spread(acs10.sub, GEOID, estimate)
df2 <- data.frame(t(data_wide[-1]))
colnames(df2) <- c("pop_total","pop_65plus","pop_whiteNH","hs1","hs2","hs3","hs4","hs5","medincome")
df2$hs <- df2$hs1 + df2$hs2 + df2$hs3 + df2$hs4 + df2$hs5
df2$GEOID <- rownames(df2)
df2$pct65plus <- df2$pop_65plus/df2$pop_total
df2$pctwhite <- df2$pop_whiteNH/df2$pop_total
df2$pcths <- df2$hs/df2$pop_total
df2$l_medincome <- log(df2$medincome)
df2 <- df2[c("GEOID","pct65plus","pctwhite","pcths","medincome","l_medincome","pop_total")]
df2$year <- 2010
#df2$state <- substr(df2$GEOID, start = 1, stop = 2)
d10.5yr <- df2
rm(acs10.5yr,acs10.sub,data_wide,df2,vars.10.acs5)
### 2010 end

### combining 2010 datasets START
d10.m <- merge(d10.5yr, d10.1yr, by = c("GEOID"), all.x=T)
rm(d10.1yr,d10.5yr)
### combining 2010 datasets END

### 2012 start
acs12.sub <- acs12
acs12.sub$NAME <- NULL
acs12.sub$moe <- NULL
data_wide <- spread(acs12.sub, GEOID, estimate)
df2 <- data.frame(t(data_wide[-1]))
colnames(df2) <- c("med_m_u18","med_m_1864","med_m_65p","med_f_u18","med_f_1864","med_f_65p",
  "pop_total","pop_65plus","pop_whiteNH","hs1","hs2","hs3","hs4","hs5","medincome")
df2$hs <- df2$hs1 + df2$hs2 + df2$hs3 + df2$hs4 + df2$hs5
df2$GEOID <- rownames(df2)
df2$pct65plus <- df2$pop_65plus/df2$pop_total
df2$pctwhite <- df2$pop_whiteNH/df2$pop_total
df2$pcths <- df2$hs/df2$pop_total
df2$l_medincome <- log(df2$medincome)
df2$med_tot <- df2$med_m_u18+df2$med_m_1864+df2$med_m_65+df2$med_f_u18+df2$med_f_1864+df2$med_f_65p
df2$pct_med <- df2$med_tot/df2$pop_total
df2 <- df2[c("GEOID","pct65plus","pctwhite","pcths","medincome","l_medincome","pct_med","pop_total")]
df2$year <- 2012
df2$highmed <- ifelse(df2$pct_med>=median(df2$pct_med, na.rm=T), 1, 0)
d12 <- df2
rm(acs12,acs12.sub,data_wide,df2,vars.12)
### 2012 end

### 2014 start
acs14.sub <- acs14
acs14.sub$NAME <- NULL
acs14.sub$moe <- NULL
data_wide <- spread(acs14.sub, GEOID, estimate)
df2 <- data.frame(t(data_wide[-1]))
colnames(df2) <- c("med_m_u18","med_m_1864","med_m_65p","med_f_u18","med_f_1864","med_f_65p",
                   "pop_total","pop_65plus","pop_whiteNH","hs1","hs2","hs3","hs4","hs5","medincome")
df2$hs <- df2$hs1 + df2$hs2 + df2$hs3 + df2$hs4 + df2$hs5
df2$GEOID <- rownames(df2)
df2$pct65plus <- df2$pop_65plus/df2$pop_total
df2$pctwhite <- df2$pop_whiteNH/df2$pop_total
df2$pcths <- df2$hs/df2$pop_total
df2$l_medincome <- log(df2$medincome)
df2$med_tot <- df2$med_m_u18+df2$med_m_1864+df2$med_m_65+df2$med_f_u18+df2$med_f_1864+df2$med_f_65p
df2$pct_med <- df2$med_tot/df2$pop_total
df2 <- df2[c("GEOID","pct65plus","pctwhite","pcths","medincome","l_medincome","pct_med","pop_total")]
df2$year <- 2014
df2$highmed <- ifelse(df2$pct_med>=median(df2$pct_med, na.rm=T), 1, 0)
d14 <- df2
rm(acs14,acs14.sub,data_wide,df2,vars.14)
### 2014 end

### 2016 start
acs16.sub <- acs16
acs16.sub$NAME <- NULL
acs16.sub$moe <- NULL
data_wide <- spread(acs16.sub, GEOID, estimate)
df2 <- data.frame(t(data_wide[-1]))
colnames(df2) <- c("pop_total","pop_65plus","pop_whiteNH","hs1","hs2","hs3","hs4","hs5","medincome",
                   "med_u18","med_1864","med_65p")
df2$hs <- df2$hs1 + df2$hs2 + df2$hs3 + df2$hs4 + df2$hs5
df2$GEOID <- rownames(df2)
df2$pct65plus <- df2$pop_65plus/df2$pop_total
df2$pctwhite <- df2$pop_whiteNH/df2$pop_total
df2$pcths <- df2$hs/df2$pop_total
df2$l_medincome <- log(df2$medincome)
df2$med_tot <- df2$med_u18+df2$med_1864+df2$med_65p
df2$pct_med <- df2$med_tot/df2$pop_total
df2 <- df2[c("GEOID","pct65plus","pctwhite","pcths","medincome","l_medincome","pct_med","pop_total")]
df2$year <- 2016
df2$highmed <- ifelse(df2$pct_med>=median(df2$pct_med, na.rm=T), 1, 0)
d16 <- df2
rm(acs16,acs16.sub,data_wide,df2,vars.16)
### 2016 end

### 2018 start
acs18.sub <- acs18
acs18.sub$NAME <- NULL
acs18.sub$moe <- NULL
data_wide <- spread(acs18.sub, GEOID, estimate)
df2 <- data.frame(t(data_wide[-1]))
colnames(df2) <- c("pop_total","pop_65plus","pop_whiteNH","cvap","hs1","hs2","hs3","hs4","hs5","medincome",
                   "med_u18","med_1864","med_65p")
df2$hs <- df2$hs1 + df2$hs2 + df2$hs3 + df2$hs4 + df2$hs5
df2$GEOID <- rownames(df2)
df2$pct65plus <- df2$pop_65plus/df2$pop_total
df2$pctwhite <- df2$pop_whiteNH/df2$pop_total
df2$pcths <- df2$hs/df2$pop_total
df2$l_medincome <- log(df2$medincome)
df2$l_cvap <- log(df2$cvap)
df2$med_tot <- df2$med_u18+df2$med_1864+df2$med_65p
df2$pct_med <- df2$med_tot/df2$pop_total
df2 <- df2[c("GEOID","pct65plus","pctwhite","pcths","medincome","l_medincome","cvap","l_cvap","pct_med","pop_total")]
df2$year <- 2018
df2$highmed <- ifelse(df2$pct_med>=median(df2$pct_med, na.rm=T), 1, 0)
d18 <- df2
rm(acs18,acs18.sub,data_wide,df2)
### 2018 end

### 2020 start
acs20.sub <- acs20
acs20.sub$NAME <- NULL
acs20.sub$moe <- NULL
data_wide <- spread(acs20.sub, GEOID, estimate)
df2 <- data.frame(t(data_wide[-1]))
colnames(df2) <- c("pop_total","pop_65plus","pop_whiteNH","cvap","hs1","hs2","hs3","hs4","hs5","medincome",
                   "med_u18","med_1864","med_65p")
df2$hs <- df2$hs1 + df2$hs2 + df2$hs3 + df2$hs4 + df2$hs5
df2$GEOID <- rownames(df2)
df2$pct65plus <- df2$pop_65plus/df2$pop_total
df2$pctwhite <- df2$pop_whiteNH/df2$pop_total
df2$pcths <- df2$hs/df2$pop_total
df2$l_medincome <- log(df2$medincome)
df2$l_cvap <- log(df2$cvap)
df2$med_tot <- df2$med_u18+df2$med_1864+df2$med_65p
df2$pct_med <- df2$med_tot/df2$pop_total
df2 <- df2[c("GEOID","pct65plus","pctwhite","pcths","medincome","l_medincome","cvap","l_cvap","pct_med","pop_total")]
df2$year <- 2020
df2$highmed <- ifelse(df2$pct_med>=median(df2$pct_med, na.rm=T), 1, 0)
d20 <- df2
rm(acs20,acs20.sub,data_wide,df2,vars.1820)
### 2020 end

## getting CVAP for pre-2018 ##
setwd("input_data/")
### 2010 start
vap10 <- read.csv("CVAP_2006-2010_ACS_csv_files/County.csv")
vap10 <- subset(vap10, LNTITLE=="Total")
vap10$GEOID <- as.numeric(sub('.*US', '', vap10$GEOID))
vap10$cvap <- as.numeric(vap10$CVAP_EST)
vars <- c("GEOID","cvap")
vap10 <- vap10[vars]
vap10$l_cvap <- log(vap10$cvap)
vap10$year <- 2010
### 2010 end
### 2012 start
vap12 <- read.csv("CVAP_2008-2012_ACS_csv_files/County.csv")
vap12 <- subset(vap12, LNTITLE=="Total")
vap12$GEOID <- as.numeric(sub('.*US', '', vap12$GEOID))
vap12$cvap <- as.numeric(vap12$CVAP_EST)
vars <- c("GEOID","cvap")
vap12 <- vap12[vars]
vap12$l_cvap <- log(vap12$cvap)
vap12$year <- 2012
### 2012 end
### 2014 start
vap14 <- read.csv("CVAP_2010-2014_ACS_csv_files/County.csv")
vap14 <- subset(vap14, LNTITLE=="Total")
vap14$GEOID <- as.numeric(sub('.*US', '', vap14$GEOID))
vap14$cvap <- as.numeric(vap14$CVAP_EST)
vars <- c("GEOID","cvap")
vap14 <- vap14[vars]
vap14$l_cvap <- log(vap14$cvap)
vap14$year <- 2014
### 2014 end
### 2016 start
vap16 <- read.csv("CVAP_2012-2016_ACS_csv_files/County.csv")
vap16 <- subset(vap16, LNTITLE=="Total")
vap16$GEOID <- as.numeric(sub('.*US', '', vap16$GEOID))
vap16$cvap <- as.numeric(vap16$CVAP_EST)
vars <- c("GEOID","cvap")
vap16 <- vap16[vars]
vap16$l_cvap <- log(vap16$cvap)
vap16$year <- 2016
### 2016 end
### merging ###
cvap <- rbind.data.frame(vap10, vap12, vap14, vap16)
rm(vars,vap10, vap12, vap14, vap16)
### 2010 start 
cvap10 <- subset(cvap, year==2010)
d10 <- d10.m
d10$GEOID <- as.numeric(d10$GEOID)
d10$year <- NULL
d10 <- merge(d10, cvap10, by = "GEOID")
rm(cvap10,d10.m)
### 2010 end 
### 2012 start 
cvap12 <- subset(cvap, year==2012)
d12$GEOID <- as.numeric(d12$GEOID)
d12$year <- NULL
d12 <- merge(d12, cvap12, by = "GEOID")
rm(cvap12)
### 2012 end 
### 2014 start 
cvap14 <- subset(cvap, year==2014)
d14$GEOID <- as.numeric(d14$GEOID)
d14$year <- NULL
d14 <- merge(d14, cvap14, by = "GEOID")
rm(cvap14)
### 2014 end 
### 2016 start 
cvap16 <- subset(cvap, year==2016)
d16$GEOID <- as.numeric(d16$GEOID)
d16$year <- NULL
d16 <- merge(d16, cvap16, by = "GEOID")
rm(cvap16)
### 2016 end 
rm(cvap)

##### merge #####
d18$GEOID <- as.numeric(d18$GEOID)
d20$GEOID <- as.numeric(d20$GEOID)
acs <- rbind.data.frame(d10,d12,d14,d16,d18,d20)
# clean
names(acs)[1] <- "fips"
acs$pct_med <- NULL
acs$highmed <- NULL
acs$lpop <- log(acs$pop_total)

# saving out data
#setwd("demographic_data/")
write.table(acs, file = "acs_county_data.csv", sep = ",", row.names=F)

